import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from IPython.display import display, HTML
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer
from bow_tfidf_pca import create_bow, get_tfidf, get_pca
import itertools
# Apply seaborn's default theme so the matplotlib figures drawn below share one look.
sns.set()
# Index levels of the token table, ordered from the coarsest (book) to the finest (token).
OHCO = ['book_id', 'chap_id', 'para_num', 'sent_num', 'token_num']
# Lexicon columns that get averaged in the sentiment aggregations.
emo_cols = [
    'anger', 'anticipation', 'disgust', 'fear', 'joy',
    'sadness', 'surprise', 'trust', 'polarity',
]
# Index prefixes for each aggregation level: the first 1, 2, 3 and 4 levels of OHCO.
BOOKS, CHAPS, PARAS, SENTS = (OHCO[:depth] for depth in range(1, 5))
# Load the pre-processed per-author tables: library metadata (LIB) and token corpus (CORPUS).
dickens_LIB, dickens_CORPUS = (
    pd.read_csv('dickens_pre_LIB.csv'),
    pd.read_csv('dickens_pre_CORPUS.csv'),
)
twain_LIB, twain_CORPUS = (
    pd.read_csv('twain_pre_LIB.csv'),
    pd.read_csv('twain_pre_CORPUS.csv'),
)
# LIB Table
# Combined LIB: the books of both authors in one frame, indexed and sorted by book_id.
LIB = pd.concat([dickens_LIB, twain_LIB]).set_index(BOOKS).sort_index()
# Label each book as "<author>_<title with underscores>_<book_id>".
# Inside a row-wise apply, `row.name` is the row's index value, i.e. the book_id.
LIB['label'] = LIB.apply(
    lambda row: "{}_{}_{}".format(row.author, row.title.replace(' ', '_'), row.name),
    axis=1,
)
LIB
| source_file_path | title | chap_regex | author | type | year | decade | n_chaps | book_len | label | |
|---|---|---|---|---|---|---|---|---|---|---|
| book_id | ||||||||||
| 70 | Twain/70-what_is_man.txt | what is man | WHAT IS MAN?|THE DEATH OF JEAN|THE TURNING-POI... | twain | non-fiction | 1906 | 1900 | 17 | 96111 | twain_what_is_man_70 |
| 74 | Twain/74-the_adventures_of_tom_sawyer.txt | the adventures of tom sawyer | ^\s*CHAPTER\s*[IVXLCM]+$ | twain | novel | 1876 | 1870 | 35 | 70276 | twain_the_adventures_of_tom_sawyer_74 |
| 76 | Twain/76-the_adventures_of_huckleberry_finn.txt | the adventures of huckleberry finn | ^\s*CHAPTER\s*(?:[IVXLCM]+\.|THE LAST)$ | twain | novel | 1884 | 1880 | 43 | 111908 | twain_the_adventures_of_huckleberry_finn_76 |
| 86 | Twain/86-a_connecticut_yankee_in_king_arthurs_... | a connecticut yankee in king arthurs court | ^\s*(?:PREFACE|A WORD OF EXPLANATION|THE STRAN... | twain | novel | 1889 | 1880 | 47 | 119100 | twain_a_connecticut_yankee_in_king_arthurs_cou... |
| 91 | Twain/91-tom_sawyer_abroad.txt | tom sawyer abroad | CHAPTER\s[IVXLCM]+\. | twain | novel | 1894 | 1890 | 13 | 33969 | twain_tom_sawyer_abroad_91 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 35536 | Dickens/35536-the_poems_and_verses_of_charles_... | the poems and verses of charles dickens | THE VILLAGE COQUETTES$|THE LAMPLIGHTER$|SONGS ... | dickens | stories | 1885 | 1880 | 13 | 10952 | dickens_the_poems_and_verses_of_charles_dicken... |
| 60900 | Twain/60900-merry_tales.txt | merry tales | ^THE PRIVATE HISTORY OF A CAMPAIGN THAT FAILED... | twain | stories | 1892 | 1890 | 6 | 36846 | twain_merry_tales_60900 |
| 61522 | Twain/61522-the_1000000_bank_note.txt | the 1000000 bank note | ^_THE £1,000,000 BANK-NOTE_$|^_METNAL TELEGRAP... | twain | stories | 1893 | 1890 | 6 | 65207 | twain_the_1000000_bank_note_61522 |
| 62636 | Twain/62636-to_the_person_sitting_in_darkness.txt | to the person sitting in darkness | ^Extending the Blessings | twain | non-fiction | 1901 | 1900 | 1 | 4719 | twain_to_the_person_sitting_in_darkness_62636 |
| 62739 | Twain/62739-king_leopolds_soliloquy.txt | king leopolds soliloquy | ^(\[_Throws down pamphlets which he has|Footnote) | twain | stories | 1905 | 1900 | 6 | 12797 | twain_king_leopolds_soliloquy_62739 |
95 rows × 10 columns
# CORPUS Table
# Combined corpus: the tokens of both authors in one frame, indexed by the full OHCO.
CORPUS = pd.concat([dickens_CORPUS, twain_CORPUS]).set_index(OHCO)
# Remove tokens whose normalized term (term_str) is missing.
CORPUS = CORPUS.dropna(subset=['term_str'])
CORPUS
| pos_tuple | pos | token_str | term_str | |||||
|---|---|---|---|---|---|---|---|---|
| book_id | chap_id | para_num | sent_num | token_num | ||||
| 98 | 1 | 0 | 0 | 0 | ('The', 'DT') | DT | The | the |
| 1 | ('Period', 'NN') | NN | Period | period | ||||
| 1 | 0 | 0 | ('It', 'PRP') | PRP | It | it | ||
| 1 | ('was', 'VBD') | VBD | was | was | ||||
| 2 | ('the', 'DT') | DT | the | the | ||||
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 62739 | 6 | 13 | 0 | 8 | ("Leopold's", 'NNP') | NNP | Leopold's | leopolds |
| 9 | ('Soliloquy,', 'NNP') | NNP | Soliloquy, | soliloquy | ||||
| 10 | ('by', 'IN') | IN | by | by | ||||
| 11 | ('Mark', 'NNP') | NNP | Mark | mark | ||||
| 12 | ('Twain', 'NNP') | NNP | Twain | twain |
7940320 rows × 4 columns
# VOCAB Table
# Load the precomputed vocabulary table, keyed by term string.
VOCAB = pd.read_csv('full_VOCAB.csv').set_index('term_str')
VOCAB.head()
| term_rank | n | n_chars | p | i | max_pos | n_pos | cat_pos | stop | stem_porter | stem_snowball | stem_lancaster | term_rank2 | zipf_k | zipf_k2 | tfidf_mean_chap_max | tfidf_max_chap_max | df | idf | dfidf | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| term_str | ||||||||||||||||||||
| the | 1 | 418963 | 3 | 0.052764 | 4.244302 | DT | 22 | {'PRP', 'FW', 'RB', 'NN', 'JJS', 'NNP', 'VBZ',... | 1 | the | the | the | 1 | 418963 | 418963 | 0.001204 | 0.001261 | 2288 | 0.001261 | 2.884130 |
| and | 2 | 310105 | 3 | 0.039054 | 4.678368 | CC | 20 | {'PRP', 'FW', 'RB', 'PDT', 'NN', 'NNP', 'VBZ',... | 1 | and | and | and | 2 | 620210 | 620210 | 0.001852 | 0.002522 | 2286 | 0.002522 | 5.765737 |
| of | 3 | 218996 | 2 | 0.027580 | 5.180221 | IN | 19 | {'PRP', 'FW', 'RB', 'PDT', 'NN', 'NNP', 'VBZ',... | 1 | of | of | of | 3 | 656988 | 656988 | 0.000965 | 0.001891 | 2287 | 0.001891 | 4.325249 |
| to | 4 | 206700 | 2 | 0.026032 | 5.263587 | TO | 23 | {'WDT', 'FW', 'RB', 'PDT', 'NN', 'NNP', 'VBZ',... | 1 | to | to | to | 4 | 826800 | 826800 | 0.000924 | 0.001891 | 2287 | 0.001891 | 4.325249 |
| a | 5 | 189310 | 1 | 0.023842 | 5.390375 | DT | 21 | {'RBR', 'PRP', 'FW', 'RB', 'NN', 'NNP', 'VBZ',... | 1 | a | a | a | 5 | 946550 | 946550 | 0.001707 | 0.003785 | 2284 | 0.003785 | 8.644820 |
# Chapter-level document frequency (df) and dfidf recomputation, left disabled.
# NOTE(review): the loaded VOCAB table already shows `df` and `dfidf` columns, so this
# presumably only documents how they were produced; confirm before deleting.
# CHAPTERS = CORPUS.groupby(OHCO[:2]+['term_str']).term_str.count().unstack()
# VOCAB['df'] = CHAPTERS.count()
# VOCAB['dfidf'] = VOCAB.df * np.log2(len(CHAPTERS)/VOCAB.df)
VOCAB.head()
| term_rank | n | n_chars | p | i | max_pos | n_pos | cat_pos | stop | stem_porter | stem_snowball | stem_lancaster | term_rank2 | zipf_k | zipf_k2 | tfidf_mean_chap_max | tfidf_max_chap_max | df | idf | dfidf | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| term_str | ||||||||||||||||||||
| the | 1 | 418963 | 3 | 0.052764 | 4.244302 | DT | 22 | {'PRP', 'FW', 'RB', 'NN', 'JJS', 'NNP', 'VBZ',... | 1 | the | the | the | 1 | 418963 | 418963 | 0.001204 | 0.001261 | 2288 | 0.001261 | 2.884130 |
| and | 2 | 310105 | 3 | 0.039054 | 4.678368 | CC | 20 | {'PRP', 'FW', 'RB', 'PDT', 'NN', 'NNP', 'VBZ',... | 1 | and | and | and | 2 | 620210 | 620210 | 0.001852 | 0.002522 | 2286 | 0.002522 | 5.765737 |
| of | 3 | 218996 | 2 | 0.027580 | 5.180221 | IN | 19 | {'PRP', 'FW', 'RB', 'PDT', 'NN', 'NNP', 'VBZ',... | 1 | of | of | of | 3 | 656988 | 656988 | 0.000965 | 0.001891 | 2287 | 0.001891 | 4.325249 |
| to | 4 | 206700 | 2 | 0.026032 | 5.263587 | TO | 23 | {'WDT', 'FW', 'RB', 'PDT', 'NN', 'NNP', 'VBZ',... | 1 | to | to | to | 4 | 826800 | 826800 | 0.000924 | 0.001891 | 2287 | 0.001891 | 4.325249 |
| a | 5 | 189310 | 1 | 0.023842 | 5.390375 | DT | 21 | {'RBR', 'PRP', 'FW', 'RB', 'NN', 'NNP', 'VBZ',... | 1 | a | a | a | 5 | 946550 | 946550 | 0.001707 | 0.003785 | 2284 | 0.003785 | 8.644820 |
# Sentiment lexicon keyed by term; the 'nrc_' prefix is removed from every column name.
SALEX = (
    pd.read_csv('../salex/salex_nrc.csv')
    .set_index('term_str')
    .rename(columns=lambda name: name.replace('nrc_', ''))
)
SALEX
| anger | anticipation | disgust | fear | joy | negative | positive | sadness | surprise | trust | polarity | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| term_str | |||||||||||
| abandon | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | -1 |
| abandoned | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | -1 |
| abandonment | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 1 | 0 | -1 |
| abduction | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 1 | 0 | -1 |
| aberration | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | -1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| young | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 1 |
| youth | 1 | 1 | 0 | 1 | 1 | 0 | 1 | 0 | 1 | 0 | 1 |
| zeal | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 1 | 1 |
| zealous | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 1 |
| zest | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 1 |
3688 rows × 11 columns
# Build the bag of words at chapter level with the project helper `create_bow`.
# The displayed result is indexed by (book_id, chap_id, term_str) and has a single
# column `n`; presumably the per-chapter term count, to be confirmed in bow_tfidf_pca.
BOW = create_bow(CORPUS, CHAPS)
BOW
| n | |||
|---|---|---|---|
| book_id | chap_id | term_str | |
| 70 | 1 | 1835 | 1 |
| 1910 | 1 | ||
| a | 2 | ||
| alphabet | 1 | ||
| as | 2 | ||
| ... | ... | ... | ... |
| 62739 | 6 | will | 1 |
| with | 1 | ||
| would | 1 | ||
| year | 1 | ||
| you | 1 |
2307752 rows × 1 columns
# COMBO table
# Token-level table enriched with the book metadata (LIB) and, for terms present in the
# lexicon, the sentiment columns (SALEX). Terms missing from SALEX get NaN there.
# NOTE(review): the only column BOW contributes is `n`, which is dropped right away, so
# the BOW join looks redundant; confirm create_bow yields unique keys before removing it.
COMBO = (
    CORPUS.join(LIB)
    .join(SALEX, on='term_str')
    .join(BOW, on=OHCO[:2] + ['term_str'])
    .drop(columns=['n'])
    .sort_index()
)
COMBO
| pos_tuple | pos | token_str | term_str | source_file_path | title | chap_regex | author | type | year | ... | anticipation | disgust | fear | joy | negative | positive | sadness | surprise | trust | polarity | |||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| book_id | chap_id | para_num | sent_num | token_num | |||||||||||||||||||||
| 70 | 1 | 1 | 0 | 0 | ('By', 'IN') | IN | By | by | Twain/70-what_is_man.txt | what is man | WHAT IS MAN?|THE DEATH OF JEAN|THE TURNING-POI... | twain | non-fiction | 1906 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | ('Mark', 'NNP') | NNP | Mark | mark | Twain/70-what_is_man.txt | what is man | WHAT IS MAN?|THE DEATH OF JEAN|THE TURNING-POI... | twain | non-fiction | 1906 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ||||
| 2 | ('Twain', 'NNP') | NNP | Twain | twain | Twain/70-what_is_man.txt | what is man | WHAT IS MAN?|THE DEATH OF JEAN|THE TURNING-POI... | twain | non-fiction | 1906 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ||||
| 2 | 0 | 0 | ('(Samuel', 'JJ') | JJ | (Samuel | samuel | Twain/70-what_is_man.txt | what is man | WHAT IS MAN?|THE DEATH OF JEAN|THE TURNING-POI... | twain | non-fiction | 1906 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ||
| 1 | ('Langhorne', 'NNP') | NNP | Langhorne | langhorne | Twain/70-what_is_man.txt | what is man | WHAT IS MAN?|THE DEATH OF JEAN|THE TURNING-POI... | twain | non-fiction | 1906 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ||||
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 62739 | 6 | 13 | 0 | 8 | ("Leopold's", 'NNP') | NNP | Leopold's | leopolds | Twain/62739-king_leopolds_soliloquy.txt | king leopolds soliloquy | ^(\[_Throws down pamphlets which he has|Footnote) | twain | stories | 1905 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 9 | ('Soliloquy,', 'NNP') | NNP | Soliloquy, | soliloquy | Twain/62739-king_leopolds_soliloquy.txt | king leopolds soliloquy | ^(\[_Throws down pamphlets which he has|Footnote) | twain | stories | 1905 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ||||
| 10 | ('by', 'IN') | IN | by | by | Twain/62739-king_leopolds_soliloquy.txt | king leopolds soliloquy | ^(\[_Throws down pamphlets which he has|Footnote) | twain | stories | 1905 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ||||
| 11 | ('Mark', 'NNP') | NNP | Mark | mark | Twain/62739-king_leopolds_soliloquy.txt | king leopolds soliloquy | ^(\[_Throws down pamphlets which he has|Footnote) | twain | stories | 1905 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ||||
| 12 | ('Twain', 'NNP') | NNP | Twain | twain | Twain/62739-king_leopolds_soliloquy.txt | king leopolds soliloquy | ^(\[_Throws down pamphlets which he has|Footnote) | twain | stories | 1905 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
7940320 rows × 25 columns
# Mean emotion / polarity score per book, with each book's label and type attached.
books = (
    COMBO.groupby(BOOKS)[emo_cols]
    .mean()
    .join(LIB[['label', 'type']])
)
# Colour the cells on one gradient computed over the whole table (axis=None).
books.style.background_gradient(axis=None, cmap='GnBu')
| anger | anticipation | disgust | fear | joy | sadness | surprise | trust | polarity | label | type | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| book_id | |||||||||||
| 70 | 0.218018 | 0.293178 | 0.162472 | 0.288144 | 0.355147 | 0.270266 | 0.182086 | 0.449922 | 0.158132 | twain_what_is_man_70 | non-fiction |
| 74 | 0.270876 | 0.273164 | 0.230840 | 0.324640 | 0.304049 | 0.335392 | 0.179364 | 0.328300 | -0.097232 | twain_the_adventures_of_tom_sawyer_74 | novel |
| 76 | 0.290509 | 0.304715 | 0.224007 | 0.322215 | 0.380894 | 0.294832 | 0.225242 | 0.407453 | 0.027177 | twain_the_adventures_of_huckleberry_finn_76 | novel |
| 86 | 0.245507 | 0.278733 | 0.168067 | 0.315837 | 0.321138 | 0.272786 | 0.195217 | 0.427020 | 0.085197 | twain_a_connecticut_yankee_in_king_arthurs_court_86 | novel |
| 91 | 0.251055 | 0.299578 | 0.196203 | 0.281294 | 0.355134 | 0.288326 | 0.194093 | 0.429677 | 0.061885 | twain_tom_sawyer_abroad_91 | novel |
| 93 | 0.270936 | 0.261084 | 0.252874 | 0.343186 | 0.314450 | 0.344828 | 0.170772 | 0.399836 | -0.055008 | twain_tom_sawyer_detective_93 | novel |
| 98 | 0.261961 | 0.280253 | 0.179526 | 0.333255 | 0.307810 | 0.295380 | 0.177181 | 0.398687 | 0.042448 | dickens_a_tale_of_two_cities_98 | novel |
| 102 | 0.283348 | 0.308050 | 0.189189 | 0.338274 | 0.340308 | 0.325196 | 0.210695 | 0.380994 | -0.007847 | twain_the_tragedy_of_puddnhead_wilson_102 | novel |
| 119 | 0.219272 | 0.307659 | 0.153502 | 0.279690 | 0.367530 | 0.283186 | 0.202229 | 0.405878 | 0.102808 | twain_a_tramp_abroad_119 | non-fiction |
| 142 | 0.213292 | 0.326036 | 0.155121 | 0.277717 | 0.400625 | 0.284128 | 0.183894 | 0.426114 | 0.138389 | twain_the_30000_bequest_and_other_stories_142 | stories |
| 245 | 0.230486 | 0.278304 | 0.164843 | 0.315181 | 0.315796 | 0.287892 | 0.185618 | 0.393362 | 0.040811 | twain_life_on_the_mississippi_245 | non-fiction |
| 564 | 0.205431 | 0.291354 | 0.160950 | 0.259557 | 0.368703 | 0.250447 | 0.189532 | 0.434262 | 0.160057 | dickens_the_mystery_of_edwin_drood_564 | novel |
| 580 | 0.200398 | 0.265022 | 0.163345 | 0.255339 | 0.310942 | 0.238316 | 0.176851 | 0.458030 | 0.169971 | dickens_the_pickwick_papers_580 | novel |
| 588 | 0.187770 | 0.295115 | 0.148887 | 0.273513 | 0.365238 | 0.248588 | 0.172483 | 0.419741 | 0.156198 | dickens_master_humphreys_clock_588 | stories |
| 644 | 0.207738 | 0.251958 | 0.185168 | 0.322893 | 0.326578 | 0.337632 | 0.160755 | 0.347766 | -0.011055 | dickens_the_haunted_man_and_the_ghosts_bargain_644 | stories |
| 650 | 0.239856 | 0.283138 | 0.192741 | 0.297115 | 0.344905 | 0.314022 | 0.173805 | 0.351668 | -0.000225 | dickens_pictures_from_italy_650 | non-fiction |
| 653 | 0.205102 | 0.304008 | 0.151484 | 0.262884 | 0.407080 | 0.266007 | 0.187923 | 0.417491 | 0.160333 | dickens_the_chimes_653 | novel |
| 675 | 0.232051 | 0.270353 | 0.173878 | 0.296795 | 0.334615 | 0.299038 | 0.148718 | 0.383173 | 0.068109 | dickens_american_notes_675 | non-fiction |
| 676 | 0.171991 | 0.289137 | 0.107029 | 0.228967 | 0.425985 | 0.200213 | 0.179446 | 0.498935 | 0.308307 | dickens_the_battle_of_life_676 | novel |
| 699 | 0.313401 | 0.271915 | 0.223883 | 0.351968 | 0.293852 | 0.289253 | 0.158691 | 0.394870 | -0.033171 | dickens_a_childs_history_of_england_699 | non-fiction |
| 700 | 0.192560 | 0.319235 | 0.150475 | 0.250349 | 0.394891 | 0.269961 | 0.176438 | 0.424274 | 0.167783 | dickens_the_old_curiosity_shop_700 | novel |
| 730 | 0.241240 | 0.279724 | 0.204232 | 0.297343 | 0.311220 | 0.277264 | 0.180512 | 0.375492 | 0.029429 | dickens_oliver_twist_730 | novel |
| 766 | 0.182965 | 0.306261 | 0.140980 | 0.237849 | 0.398976 | 0.259059 | 0.179921 | 0.464779 | 0.209489 | dickens_david_copperfield_766 | novel |
| 786 | 0.209346 | 0.290654 | 0.173832 | 0.257009 | 0.344626 | 0.272897 | 0.191589 | 0.439486 | 0.139252 | dickens_hard_times_786 | novel |
| 807 | 0.248434 | 0.258873 | 0.189979 | 0.300626 | 0.313152 | 0.323591 | 0.187891 | 0.436326 | 0.025052 | dickens_hunted_down_807 | stories |
| 809 | 0.197531 | 0.315376 | 0.158249 | 0.210999 | 0.402918 | 0.223345 | 0.166105 | 0.398429 | 0.156004 | dickens_holiday_romance_809 | stories |
| 810 | 0.197461 | 0.272214 | 0.179126 | 0.270804 | 0.332863 | 0.263752 | 0.184767 | 0.506347 | 0.159379 | dickens_george_silvermans_explanation_810 | stories |
| 821 | 0.203769 | 0.307539 | 0.162035 | 0.260036 | 0.387920 | 0.265758 | 0.183515 | 0.419755 | 0.146412 | dickens_dombey_and_sons_821 | novel |
| 824 | 0.151991 | 0.332917 | 0.126630 | 0.208787 | 0.426505 | 0.214324 | 0.181461 | 0.512413 | 0.385605 | dickens_speeches_of_charles_dickens_824 | non-fiction |
| 872 | 0.243876 | 0.278638 | 0.179395 | 0.286383 | 0.331952 | 0.258646 | 0.172010 | 0.408141 | 0.095101 | dickens_reprinted_pieces_872 | stories |
| 882 | 0.217789 | 0.309579 | 0.175582 | 0.255020 | 0.359517 | 0.275902 | 0.183490 | 0.390972 | 0.114004 | dickens_sketches_by_boz_882 | stories |
| 883 | 0.214846 | 0.289870 | 0.196620 | 0.252888 | 0.365953 | 0.248649 | 0.190633 | 0.427678 | 0.120854 | dickens_our_mutual_friend_883 | novel |
| 888 | 0.236819 | 0.279170 | 0.178479 | 0.300778 | 0.312446 | 0.309853 | 0.197061 | 0.378997 | 0.017286 | dickens_the_lazy_tour_of_two_idle_apprentices_888 | stories |
| 912 | 0.167232 | 0.278236 | 0.162870 | 0.231701 | 0.304896 | 0.214251 | 0.168202 | 0.457586 | 0.226854 | dickens_the_mudfog_and_other_sketches_912 | stories |
| 914 | 0.224947 | 0.282716 | 0.195355 | 0.290969 | 0.340839 | 0.284367 | 0.176020 | 0.389531 | 0.072035 | dickens_the_uncommerical_traveller_914 | non-fiction |
| 916 | 0.180591 | 0.315612 | 0.153586 | 0.214346 | 0.423629 | 0.225316 | 0.171308 | 0.447257 | 0.221097 | dickens_sketches_of_young_couples_916 | stories |
| 917 | 0.227854 | 0.263985 | 0.187443 | 0.296693 | 0.314605 | 0.269426 | 0.172587 | 0.406859 | 0.040900 | dickens_barnaby_rudge_917 | stories |
| 918 | 0.128987 | 0.438696 | 0.094968 | 0.165840 | 0.508859 | 0.143161 | 0.356485 | 0.442240 | 0.494685 | dickens_sketches_of_young_gentlemen_918 | stories |
| 922 | 0.226018 | 0.345598 | 0.169514 | 0.249671 | 0.411301 | 0.253614 | 0.170828 | 0.416557 | 0.231275 | dickens_sunday_under_three_heads_922 | non-fiction |
| 927 | 0.156118 | 0.310127 | 0.111814 | 0.189873 | 0.352321 | 0.206751 | 0.229958 | 0.512658 | 0.352321 | dickens_the_lamplighter_927 | stories |
| 967 | 0.213126 | 0.291593 | 0.178251 | 0.256431 | 0.357940 | 0.258443 | 0.194347 | 0.426731 | 0.131689 | dickens_nicholas_nickleby_967 | novel |
| 968 | 0.189850 | 0.288757 | 0.155923 | 0.233625 | 0.367826 | 0.226180 | 0.188389 | 0.456884 | 0.216850 | dickens_martin_chuzzlewit_968 | novel |
| 1023 | 0.192444 | 0.298300 | 0.153908 | 0.235951 | 0.364768 | 0.241490 | 0.185201 | 0.469252 | 0.208067 | dickens_bleak_house_1023 | novel |
| 1044 | 0.162044 | 0.383942 | 0.140146 | 0.192701 | 0.470073 | 0.229197 | 0.214599 | 0.494891 | 0.305109 | twain_extract_from_captain_stormfields_visit_to_Heaven_1044 | stories |
| 1086 | 0.187443 | 0.316652 | 0.158326 | 0.236579 | 0.414923 | 0.256597 | 0.194722 | 0.422202 | 0.239308 | twain_a_horses_tale_1086 | novel |
| 1289 | 0.221018 | 0.254379 | 0.201835 | 0.326939 | 0.247706 | 0.295246 | 0.166806 | 0.386989 | -0.028357 | dickens_three_ghost_stories_1289 | stories |
| 1394 | 0.205917 | 0.294675 | 0.165680 | 0.220118 | 0.396450 | 0.244970 | 0.190533 | 0.469822 | 0.241420 | dickens_the_holly_tree_1394 | stories |
| 1400 | 0.232910 | 0.273758 | 0.194267 | 0.289300 | 0.327208 | 0.285834 | 0.169484 | 0.393153 | 0.026042 | dickens_great_expectations_1400 | novel |
| 1406 | 0.236449 | 0.277570 | 0.148598 | 0.271028 | 0.338318 | 0.273832 | 0.183178 | 0.414019 | 0.156075 | dickens_the_perils_of_certain_english_prisoners_1406 | stories |
| 1407 | 0.200803 | 0.377510 | 0.105756 | 0.191432 | 0.465863 | 0.202142 | 0.265060 | 0.481928 | 0.340027 | dickens_a_message_from_the_sea_1407 | stories |
| 1413 | 0.227513 | 0.266314 | 0.236332 | 0.245150 | 0.315697 | 0.296296 | 0.144621 | 0.403880 | 0.021164 | dickens_tom_tiddlers_ground_1413 | stories |
| 1414 | 0.185866 | 0.299129 | 0.129719 | 0.228461 | 0.369797 | 0.299129 | 0.169409 | 0.420136 | 0.146176 | dickens_somebodys_luggage_1414 | stories |
| 1415 | 0.209302 | 0.377907 | 0.133721 | 0.168605 | 0.494186 | 0.168605 | 0.226744 | 0.395349 | 0.284884 | dickens_doctor_marigold_1415 | stories |
| 1416 | 0.161981 | 0.336011 | 0.140562 | 0.218206 | 0.467202 | 0.218206 | 0.190094 | 0.469880 | 0.211513 | dickens_mrs_lirripers_lodgings_1416 | stories |
| 1421 | 0.212299 | 0.300146 | 0.178624 | 0.222548 | 0.379209 | 0.260615 | 0.181552 | 0.465593 | 0.196193 | dickens_mrs_lirripers_legacy_1421 | stories |
| 1435 | 0.330178 | 0.264477 | 0.246102 | 0.376949 | 0.290089 | 0.299555 | 0.180401 | 0.367483 | -0.008352 | dickens_miscellaneous_papers_1435 | non-fiction |
| 1467 | 0.196781 | 0.310900 | 0.196781 | 0.216533 | 0.428676 | 0.255304 | 0.193855 | 0.412582 | 0.187271 | dickens_some_christmas_stories_1467 | stories |
| 1837 | 0.235863 | 0.259962 | 0.218216 | 0.310816 | 0.306641 | 0.298672 | 0.175142 | 0.390133 | -0.002277 | twain_the_prince_and_the_pauper_1837 | novel |
| 2324 | 0.195901 | 0.315539 | 0.154909 | 0.267874 | 0.370353 | 0.296949 | 0.170639 | 0.407531 | 0.082459 | dickens_a_house_to_let_2324 | stories |
| 2874 | 0.247254 | 0.297983 | 0.159177 | 0.322349 | 0.349111 | 0.271021 | 0.182544 | 0.399641 | 0.072698 | twain_personal_recollections_of_joan_of_arc_vol_1_2874 | non-fiction |
| 2875 | 0.279614 | 0.304604 | 0.161354 | 0.374065 | 0.321921 | 0.302243 | 0.162731 | 0.377410 | -0.008461 | twain_personal_recollections_of_joan_of_arc_vol_2_2875 | non-fiction |
| 2895 | 0.237565 | 0.297570 | 0.173787 | 0.316870 | 0.348188 | 0.273006 | 0.192824 | 0.410299 | 0.076235 | twain_following_the_equator_2895 | non-fiction |
| 3171 | 0.229814 | 0.313221 | 0.197870 | 0.253771 | 0.402839 | 0.266193 | 0.174800 | 0.394854 | 0.186335 | twain_in_defense_of_harriet_shelley_3171 | non-fiction |
| 3172 | 0.210526 | 0.298246 | 0.100877 | 0.250000 | 0.364035 | 0.289474 | 0.241228 | 0.412281 | 0.153509 | twain_fenimore_coopers_literary_offences_3172 | non-fiction |
| 3173 | 0.201908 | 0.271860 | 0.174881 | 0.241653 | 0.365660 | 0.225755 | 0.240064 | 0.492846 | 0.200318 | twain_essays_on_paul_bourget_3173 | non-fiction |
| 3176 | 0.231709 | 0.292387 | 0.174175 | 0.291339 | 0.361009 | 0.285228 | 0.175746 | 0.393924 | 0.080234 | twain_the_innocents_abroad_3176 | non-fiction |
| 3177 | 0.264843 | 0.278414 | 0.183206 | 0.327290 | 0.302481 | 0.293681 | 0.180874 | 0.390373 | 0.003499 | twain_roughing_it_3177 | novel |
| 3178 | 0.238616 | 0.323363 | 0.158949 | 0.257118 | 0.364778 | 0.273799 | 0.194037 | 0.448567 | 0.161058 | twain_the_gilded_age_3178 | novel |
| 3179 | 0.219057 | 0.283045 | 0.200374 | 0.269967 | 0.365950 | 0.279542 | 0.197571 | 0.430406 | 0.120504 | twain_the_american_claimant_3179 | novel |
| 3180 | 0.245833 | 0.313333 | 0.181667 | 0.320000 | 0.355000 | 0.306667 | 0.210000 | 0.386667 | 0.003333 | twain_a_double_barrelled_detective_story_3180 | stories |
| 3181 | 0.244792 | 0.281250 | 0.127604 | 0.304688 | 0.286458 | 0.260417 | 0.221354 | 0.481771 | 0.054688 | twain_the_stolen_white_elephant_3181 | stories |
| 3182 | 0.163432 | 0.317671 | 0.148110 | 0.249234 | 0.411645 | 0.255363 | 0.216547 | 0.450460 | 0.200204 | twain_some_rambling_notes_of_an_idle_excursion_3182 | non-fiction |
| 3183 | 0.252390 | 0.214149 | 0.254302 | 0.296367 | 0.258126 | 0.282983 | 0.133843 | 0.399618 | 0.000000 | twain_the_facts_concerning_the_recent_carnival_of_crime_in_connecticut_3183 | stories |
| 3184 | 0.218612 | 0.308715 | 0.162482 | 0.265879 | 0.374200 | 0.305761 | 0.204333 | 0.413589 | 0.109306 | twain_alonzo_fitz_and_other_stories_3184 | stories |
| 3185 | 0.257292 | 0.259536 | 0.141361 | 0.299925 | 0.275991 | 0.250561 | 0.166043 | 0.445774 | 0.130890 | twain_those_extraordinary_twins_3185 | stories |
| 3186 | 0.257972 | 0.293802 | 0.206019 | 0.336797 | 0.361161 | 0.288785 | 0.198495 | 0.359011 | 0.005374 | twain_the_mysterious_stranger_and_other_stories_3186 | stories |
| 3188 | 0.184236 | 0.319212 | 0.133990 | 0.223251 | 0.400985 | 0.224433 | 0.186010 | 0.519015 | 0.313103 | twain_mark_twain_speeches_3188 | non-fiction |
| 3189 | 0.253662 | 0.267529 | 0.199283 | 0.322686 | 0.310221 | 0.307884 | 0.171705 | 0.382830 | -0.003428 | twain_sketches_new_and_old_3189 | stories |
| 3190 | 0.217327 | 0.259912 | 0.220264 | 0.212922 | 0.348018 | 0.193833 | 0.152717 | 0.436123 | 0.176211 | twain_1601_conversation_as_it_was_by_the_social_fireside_in_the_time_of_the_tudors_3190 | stories |
| 3191 | 0.304450 | 0.259953 | 0.217799 | 0.346604 | 0.271663 | 0.278689 | 0.142857 | 0.377049 | -0.072600 | twain_goldsmiths_friend_abroad_again_3191 | stories |
| 3192 | 0.277589 | 0.280136 | 0.231749 | 0.345501 | 0.295416 | 0.285229 | 0.180815 | 0.376910 | -0.026316 | twain_the_curious_republic_of_gondour_and_other_whimsical_sketches_3192 | stories |
| 3199 | 0.195551 | 0.340659 | 0.147474 | 0.235513 | 0.425451 | 0.247346 | 0.205355 | 0.457096 | 0.244641 | twain_the_letters_of_mark_twain_3199 | non-fiction |
| 3250 | 0.205882 | 0.278075 | 0.152406 | 0.272727 | 0.377005 | 0.286096 | 0.229947 | 0.390374 | 0.131016 | twain_how_to_tell_a_story_and_other_essays_3250 | non-fiction |
| 3251 | 0.241768 | 0.289614 | 0.157473 | 0.302420 | 0.333521 | 0.284548 | 0.179567 | 0.433014 | 0.079229 | twain_the_man_that_corrupted_hadleyburg_and_other_stories_3251 | stories |
| 19337 | 0.211608 | 0.311971 | 0.178356 | 0.266626 | 0.417170 | 0.265417 | 0.206771 | 0.377267 | 0.091898 | dickens_a_christmas_carol_19337 | novel |
| 19484 | 0.297638 | 0.258268 | 0.193701 | 0.346457 | 0.297638 | 0.348031 | 0.196850 | 0.377953 | -0.047244 | twain_editorial_wild_oats_19484 | stories |
| 19987 | 0.204595 | 0.326821 | 0.151922 | 0.262426 | 0.377931 | 0.283214 | 0.202563 | 0.432948 | 0.141294 | twain_chapters_from_my_autobiography_19987 | non-fiction |
| 20795 | 0.172968 | 0.289698 | 0.201796 | 0.199905 | 0.436673 | 0.206994 | 0.189509 | 0.378544 | 0.153119 | dickens_the_cricket_on_the_hearth_20795 | novel |
| 27924 | 0.229795 | 0.266667 | 0.165363 | 0.300559 | 0.334078 | 0.290503 | 0.168715 | 0.384358 | 0.069646 | dickens_mugby_junction_27924 | stories |
| 33077 | 0.220524 | 0.292576 | 0.155022 | 0.334061 | 0.312227 | 0.222707 | 0.141921 | 0.495633 | 0.198690 | twain_the_treaty_with_china_its_provisions_explained_33077 | non-fiction |
| 35536 | 0.231201 | 0.305275 | 0.169473 | 0.241302 | 0.432099 | 0.261504 | 0.198653 | 0.406285 | 0.209877 | dickens_the_poems_and_verses_of_charles_dickens_35536 | stories |
| 60900 | 0.232446 | 0.251816 | 0.167554 | 0.331235 | 0.291525 | 0.280387 | 0.175787 | 0.389346 | -0.010654 | twain_merry_tales_60900 | stories |
| 61522 | 0.210055 | 0.312593 | 0.137382 | 0.293430 | 0.376804 | 0.263813 | 0.186909 | 0.444251 | 0.154803 | twain_the_1000000_bank_note_61522 | stories |
| 62636 | 0.298701 | 0.238961 | 0.140260 | 0.303896 | 0.303896 | 0.272727 | 0.145455 | 0.464935 | 0.111688 | twain_to_the_person_sitting_in_darkness_62636 | non-fiction |
| 62739 | 0.301115 | 0.257745 | 0.231722 | 0.395291 | 0.245353 | 0.322181 | 0.146221 | 0.348203 | -0.105328 | twain_king_leopolds_soliloquy_62739 | stories |
# Horizontal stacked bars of the mean emotion scores, one bar per book, ordered by polarity.
# All books:
px.bar(books.reset_index().sort_values('polarity'), x=emo_cols, y='label', orientation='h', height=1000)
# Only books of type 'stories':
px.bar(books[books.type == 'stories'].reset_index().sort_values('polarity'), x=emo_cols, y='label', orientation='h', height=1000)
# Only books whose label mentions dickens:
px.bar(books[books.label.str.contains('dickens', case=False)].reset_index().sort_values('polarity'), x=emo_cols, y='label', orientation='h', height=1000)
# Only books whose label mentions twain:
px.bar(books[books.label.str.contains('twain', case=False)].reset_index().sort_values('polarity'), x=emo_cols, y='label', orientation='h', height=1000)
# Share of books whose mean polarity is zero or negative.
len(books[books.polarity <= 0]) / len(books)
0.17894736842105263
# Book types among the books with zero or negative mean polarity.
books.loc[books.polarity <= 0, 'type'].value_counts()
stories 9 novel 4 non-fiction 4 Name: type, dtype: int64
# How many Dickens books have a neutral or negative mean polarity.
len(books[(books.polarity <= 0) & books.label.str.contains('dickens', case=False)])
5
# How many Twain books have a neutral or negative mean polarity.
len(books[(books.polarity <= 0) & books.label.str.contains('twain', case=False)])
12
# Number of books per author in the combined library.
LIB['author'].value_counts()
dickens 50 twain 45 Name: author, dtype: int64
# Book ids per author, in LIB index order.
dickens_books = LIB.loc[LIB.author == 'dickens'].index.values
twain_books = LIB.loc[LIB.author == 'twain'].index.values
ncols = 5
# One figure per author: a grid of small horizontal bar charts, one panel per book,
# each showing that book's mean emotion scores sorted ascending.
for author_books in (dickens_books, twain_books):
    if len(author_books) == 0:
        continue  # nothing to draw for this author
    # Round up so a book count that is not a multiple of ncols keeps its last books
    # (truncating division would silently drop them).
    nrows = int(np.ceil(len(author_books) / ncols))
    # squeeze=False keeps `axes` two-dimensional even when there is a single row.
    # The y axis is deliberately not shared: every panel sorts its bars independently,
    # and shared axes use one common set of tick labels, which would mislabel the bars.
    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, sharey=False,
                             figsize=(25, nrows * ncols), squeeze=False)
    panels = axes.ravel()
    for ax, book_id in zip(panels, author_books):
        books.loc[book_id, emo_cols].sort_values().plot.barh(
            ax=ax, title=LIB.loc[book_id, "title"].title())
    # Hide the unused panels of a partially filled last row.
    for ax in panels[len(author_books):]:
        ax.set_visible(False)
    # Each panel now carries its own tick labels, so make room for them.
    fig.tight_layout()
# Convenience only: expose every book_id as an attribute named after the book's label.
class Novels:
    """Bare attribute container; one attribute per book label, holding its book_id."""


novels = Novels()
for idx, label in LIB['label'].items():
    # Spaces and en dashes in a label are turned into underscores for the attribute name.
    label = label.replace(' ', '_').replace('–', '_')
    setattr(novels, label, idx)
# VOCAB Table
# dickens books: book ids of the selected Dickens works
two_cities = novels.dickens_a_tale_of_two_cities_98
great_expectations = novels.dickens_great_expectations_1400
copperfield = novels.dickens_david_copperfield_766
twist = novels.dickens_oliver_twist_730
bleak = novels.dickens_bleak_house_1023
mutual = novels.dickens_our_mutual_friend_883
# (title, book_id) pairs, in the order listed above
dickens_works = [
    (LIB.loc[book_id, "title"], book_id)
    for book_id in (two_cities, great_expectations, copperfield, twist, bleak, mutual)
]
# twain books: book ids of the selected Twain works
rough = novels.twain_roughing_it_3177
gilded = novels.twain_the_gilded_age_3178
sawyer = novels.twain_the_adventures_of_tom_sawyer_74
huck = novels.twain_the_adventures_of_huckleberry_finn_76
yankee = novels.twain_a_connecticut_yankee_in_king_arthurs_court_86
puddnhead = novels.twain_the_tragedy_of_puddnhead_wilson_102
# (title, book_id) pairs, in the order listed above
twain_works = [
    (LIB.loc[book_id, "title"], book_id)
    for book_id in (rough, gilded, sawyer, huck, yankee, puddnhead)
]
def plot_sentiments(df, title, emo='polarity'):
    """Plot the selected emotion column(s) of `df` on a wide 25x5 figure.

    `emo` is used as-is to index `df`, so it may be a single column name or
    anything else `df[...]` accepts (callers in this file pass a list of
    column names). `title` becomes the plot title.
    """
    df[emo].plot(figsize=(25, 5), legend=True, fontsize=14, title=title)
# Mean of every emotion column per (book_id, chap_id), with the LIB metadata
# of the book joined on.
chapter_means = COMBO.groupby(OHCO[:2])[emo_cols].mean()
chaps_df = chapter_means.join(LIB)
chaps_df.head()
| anger | anticipation | disgust | fear | joy | sadness | surprise | trust | polarity | source_file_path | title | chap_regex | author | type | year | decade | n_chaps | book_len | label | ||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| book_id | chap_id | |||||||||||||||||||
| 70 | 1 | 0.400000 | 0.600000 | 0.200000 | 0.400000 | 0.600000 | 0.400000 | 0.600000 | 0.600000 | 0.200000 | Twain/70-what_is_man.txt | what is man | WHAT IS MAN?|THE DEATH OF JEAN|THE TURNING-POI... | twain | non-fiction | 1906 | 1900 | 17 | 96111 | twain_what_is_man_70 |
| 2 | 0.208876 | 0.283432 | 0.174556 | 0.284615 | 0.357396 | 0.270414 | 0.146154 | 0.465680 | 0.146154 | Twain/70-what_is_man.txt | what is man | WHAT IS MAN?|THE DEATH OF JEAN|THE TURNING-POI... | twain | non-fiction | 1906 | 1900 | 17 | 96111 | twain_what_is_man_70 | |
| 3 | 0.190647 | 0.320144 | 0.118705 | 0.287770 | 0.489209 | 0.334532 | 0.233813 | 0.446043 | 0.212230 | Twain/70-what_is_man.txt | what is man | WHAT IS MAN?|THE DEATH OF JEAN|THE TURNING-POI... | twain | non-fiction | 1906 | 1900 | 17 | 96111 | twain_what_is_man_70 | |
| 4 | 0.248756 | 0.318408 | 0.199005 | 0.373134 | 0.278607 | 0.318408 | 0.208955 | 0.378109 | 0.084577 | Twain/70-what_is_man.txt | what is man | WHAT IS MAN?|THE DEATH OF JEAN|THE TURNING-POI... | twain | non-fiction | 1906 | 1900 | 17 | 96111 | twain_what_is_man_70 | |
| 5 | 0.175115 | 0.308756 | 0.129032 | 0.221198 | 0.368664 | 0.276498 | 0.239631 | 0.405530 | 0.142857 | Twain/70-what_is_man.txt | what is man | WHAT IS MAN?|THE DEATH OF JEAN|THE TURNING-POI... | twain | non-fiction | 1906 | 1900 | 17 | 96111 | twain_what_is_man_70 |
# Chapter-level polarity, joy and fear for each selected book: Dickens first,
# then Twain.
for book_title, book_id in dickens_works:
    plot_sentiments(chaps_df.loc[book_id].reset_index(), book_title.title(), ['polarity', 'joy', 'fear'])
for book_title, book_id in twain_works:
    plot_sentiments(chaps_df.loc[book_id].reset_index(), book_title.title(), ['polarity', 'joy', 'fear'])
# Wrap every token in a <span> whose CSS class encodes the sign of its polarity
# (sent-1, sent0 or sent1). Missing values are replaced by 0 first, so tokens
# without a polarity score get the neutral class sent0.
# Fix: the opening tag used to be misspelled "<spdan", which did not match the
# closing </span>.
COMBO['html'] = COMBO.fillna(0).apply(
    lambda x: f"<span class='sent{int(np.sign(x['polarity']))}'>{x.term_str}</span>",
    axis=1,
)
COMBO.html.head()
book_id chap_id para_num sent_num token_num
70 1 1 0 0 <spdan class='sent0'>by</span>
1 <spdan class='sent0'>mark</span>
2 <spdan class='sent0'>twain</span>
2 0 0 <spdan class='sent0'>samuel</span>
1 <spdan class='sent0'>langhorne</span>
Name: html, dtype: object
# Sentence-level table: mean emotion scores per sentence, plus the sentence's
# token markup joined with single spaces.
sentence_groups = COMBO.groupby(OHCO[:-1])
SENTENCES = sentence_groups[emo_cols].mean()
SENTENCES['html_str'] = sentence_groups.html.apply(lambda tokens: tokens.str.cat(sep=' '))
def _valence_color(valence, threshold=0):
    """Return the row background colour: green above, red below, grey at `threshold`."""
    if valence > threshold:
        return '#ccffcc'
    if valence < threshold:
        return '#ffcccc'
    return '#f2f2f2'


def sample_sentences(df, sample_size=10, emo='polarity'):
    """Display a random sample of sentences as a colour-coded HTML table.

    Rows containing any missing value are dropped before sampling. Each
    sampled row shows its index label, its `html_str` markup and the value of
    `emo` rounded to four decimals.

    Parameters:
        df: frame with an `html_str` column and an `emo` column. Index labels
            are iterated to build the ID text, so a multi-level index is
            expected.
        sample_size: maximum number of rows to show; capped at the number of
            complete rows so that small frames do not raise.
        emo: name of the column used for the sentiment value and row colour.
    """
    candidates = df.dropna()
    # DataFrame.sample raises when asked for more rows than exist.
    sample = candidates.sample(min(sample_size, len(candidates))).index
    rows = []
    for idx in sample:
        valence = round(df.loc[idx, emo], 4)
        id_label = ' '.join([str(i) for i in idx]).upper()
        rows.append("""
<tr style="background-color:{0};padding:.5rem 1rem;font-size:110%;">
<td style="width:20%;">{1}</td>
<td style="width:70%;">{2}</td>
<td>{3}</td>
</tr>
""".format(_valence_color(valence), id_label, df.loc[idx, 'html_str'], valence))
    css = """
#sample1 td {font-size:110%;vertical-align:top;text-align:left;}
#sample1 th {font-size:120%;vertical-align:top;text-align:left;}
.sent-1 {color:red;font-weight:bold;}
.sent1 {color:green;font-weight:bold;}
"""
    display(HTML(f'<style>{css}</style>'))
    # Header order matches the cell order above: ID, sentence markup, value.
    display(HTML('<table id="sample1"><tr><th>ID</th><th>Sentence</th><th>Sentiment</th></tr>'+''.join(rows)+'</table>'))
# Random sample of Great Expectations sentences, coloured by mean polarity.
sample_sentences(SENTENCES.loc[great_expectations])
| Sentence | ID | Sentiment |
|---|---|---|
| 11 123 1 | 0.0 | |
| 40 54 2 | -1.0 | |
| 45 6 1 | 0.0 | |
| 55 26 0 | 1.0 | |
| 4 20 0 | 0.0 | |
| 2 60 2 | -1.0 | |
| 2 22 1 | 0.0 | |
| 5 20 0 | 0.3333 | |
| 26 46 0 | -1.0 | |
| 22 42 1 | 1.0 |
# Random sample of Bleak House sentences, coloured by mean polarity.
sample_sentences(SENTENCES.loc[bleak])
| Sentence | ID | Sentiment |
|---|---|---|
| 50 81 0 | 0.5 | |
| 30 70 6 | 0.0 | |
| 37 72 4 | -1.0 | |
| 29 69 1 | 1.0 | |
| 14 118 1 | 1.0 | |
| 5 5 0 | 1.0 | |
| 29 41 0 | 1.0 | |
| 14 26 1 | 1.0 | |
| 46 94 4 | 1.0 | |
| 15 99 0 | 1.0 |
# Random sample of Pudd'nhead Wilson sentences, coloured by mean polarity.
sample_sentences(SENTENCES.loc[puddnhead])
| Sentence | ID | Sentiment |
|---|---|---|
| 4 26 0 | 1.0 | |
| 12 30 0 | 1.0 | |
| 2 19 1 | -1.0 | |
| 22 80 2 | 0.0 | |
| 4 7 0 | 1.0 | |
| 20 38 4 | 0.5 | |
| 14 59 1 | -1.0 | |
| 8 8 0 | -0.6667 | |
| 22 13 6 | 0.0 | |
| 11 5 3 | -1.0 |
from nltk.sentiment.vader import SentimentIntensityAnalyzer

analyser = SentimentIntensityAnalyzer()
# Plain-text version of every sentence: its tokens joined with single spaces.
SENTENCES['sent_str'] = COMBO.groupby(OHCO[:-1]).term_str.apply(lambda x: x.str.cat(sep=' '))
vader_keys = "neg neu pos compound".split()
vader_cols = [f"vader_{col}" for col in vader_keys]
# Score each sentence once and build the score table in a single step from the
# list of result dicts; constructing a pd.Series per row is far slower.
vader_scores = pd.DataFrame(
    SENTENCES.sent_str.map(analyser.polarity_scores).tolist(),
    index=SENTENCES.index,
)
# Select the score columns by name so each lands in its matching vader_* column.
SENTENCES[vader_cols] = vader_scores[vader_keys].add_prefix("vader_")
SENTENCES.sample(10)
| anger | anticipation | disgust | fear | joy | sadness | surprise | trust | polarity | html_str | sent_str | vader_neg | vader_neu | vader_pos | vader_compound | ||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| book_id | chap_id | para_num | sent_num | |||||||||||||||
| 968 | 25 | 150 | 1 | 0.000000 | 0.000000 | 0.0 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | -1.00 | <spdan class='sent0'>theres</span> <spdan clas... | theres no harm in that i suppose | 0.588 | 0.412 | 0.000 | -0.6908 |
| 1400 | 3 | 32 | 0 | 0.000000 | 0.000000 | 0.0 | 1.000000 | 1.000000 | 0.000000 | 1.000000 | 0.000000 | 1.00 | <spdan class='sent0'>the</span> <spdan class='... | the man stopped eating and regarded me with th... | 0.086 | 0.498 | 0.416 | 0.8074 |
| 245 | 56 | 17 | 1 | 0.600000 | 0.400000 | 0.2 | 0.800000 | 0.000000 | 0.400000 | 0.400000 | 0.000000 | -0.60 | <spdan class='sent0'>wherever</span> <spdan cl... | wherever in the uttermost parts of the globe a... | 0.041 | 0.959 | 0.000 | -0.2960 |
| 3176 | 26 | 5 | 6 | 0.222222 | 0.444444 | 0.0 | 0.444444 | 0.222222 | 0.333333 | 0.111111 | 0.444444 | 0.00 | <spdan class='sent0'>they</span> <spdan class=... | they can work at any business they please they... | 0.040 | 0.874 | 0.086 | 0.8126 |
| 1023 | 52 | 47 | 3 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | <spdan class='sent0'>and</span> <spdan class='... | and it really was | 0.000 | 1.000 | 0.000 | 0.0000 |
| 564 | 17 | 142 | 1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | <spdan class='sent0'>where</span> <spdan class... | where are you going mr tartar | 0.000 | 1.000 | 0.000 | 0.0000 |
| 580 | 25 | 6 | 3 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | <spdan class='sent0'>business</span> <spdan cl... | business first pleasure arterwards as king ric... | 0.163 | 0.712 | 0.125 | -0.0258 |
| 882 | 54 | 159 | 7 | 0.250000 | 0.500000 | 0.0 | 0.750000 | 0.000000 | 0.250000 | 0.000000 | 0.250000 | -0.25 | <spdan class='sent0'>and</span> <spdan class='... | and horace hunter took great credit to himself... | 0.037 | 0.756 | 0.207 | 0.8779 |
| 61522 | 3 | 132 | 3 | 1.000000 | 0.000000 | 0.0 | 1.000000 | 0.000000 | 0.000000 | 0.500000 | 0.000000 | 0.00 | <spdan class='sent0'>you</span> <spdan class='... | you walk carelessly towards the academy grove ... | 0.056 | 0.864 | 0.081 | 0.2263 |
| 3199 | 9 | 77 | 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | <spdan class='sent0'>in</span> <spdan class='s... | in company with nasby clemens that season also... | 0.000 | 1.000 | 0.000 | 0.0000 |
def vader_plot(novel_name):
    """Plot rolling means of the VADER scores of one book on three stacked axes.

    `novel_name` is the key used to select the book from `SENTENCES` and `LIB`
    (callers in this file pass book ids). The panels show, top to bottom,
    positive and negative, neutral, and compound scores, each smoothed with a
    window of one fifth of the book's sentences.
    """
    X = SENTENCES.loc[novel_name]
    # At least 1 so that books with fewer than five sentences do not end up
    # with an empty window.
    w = max(1, len(X) // 5)
    heading = f'{LIB.loc[novel_name, "title"].title()} {novel_name}'
    fig, axes = plt.subplots(ncols=1, nrows=3, figsize=(25, 20), sharex=True)
    X[['vader_pos', 'vader_neg']].rolling(w).mean().plot(ax=axes[0], title=f'{heading}\npos, neg {w}')
    X['vader_neu'].rolling(w).mean().plot(ax=axes[1], title=f'{heading}\nneu')
    X['vader_compound'].rolling(w).mean().plot(ax=axes[2], title=f'{heading}\ncompound')
# VADER profiles for two Dickens and two Twain books.
for book_id in (two_cities, bleak, sawyer, rough):
    vader_plot(book_id)
FIG = dict(figsize=(12, 5), legend=True, fontsize=14, rot=45)


def _binned_emotion(novel, emo, n_bins=100):
    """Return the mean of `emo` over `n_bins` equal-width slices of a book's sentence sequence."""
    seq = SENTENCES.loc[novel].reset_index(drop=True).reset_index().rename(columns=dict(index='seq'))
    seq['cut'] = pd.cut(seq.seq, n_bins)
    # observed=False keeps every bin, even an empty one, so each book yields
    # exactly n_bins rows and two books line up position by position.
    return seq.groupby('cut', observed=False)[emo].mean().reset_index(drop=True)


def compare_novels(novel_a, novel_b, w=10, emo='vader_compound'):
    """Plot the smoothed course of one sentiment column for two books on a common axis.

    Each book's sentences are cut into 100 equal slices of its length, `emo`
    is averaged per slice, and the two resulting series are smoothed with a
    rolling mean of `w` slices before plotting.

    Parameters:
        novel_a, novel_b: keys selecting the books in `SENTENCES` and `LIB`
            (callers in this file pass book ids).
        w: rolling-mean window, in slices.
        emo: column of `SENTENCES` to compare.
    """
    C = pd.concat([_binned_emotion(novel_a, emo), _binned_emotion(novel_b, emo)], axis=1)
    C.columns = [novel_a, novel_b]
    # Named `ax` rather than `plt` so the pyplot module name is not shadowed.
    ax = C.rolling(w).mean().plot(**FIG)
    ax.set_title(f'{LIB.loc[novel_a, "title"].title()} ({novel_a}) by {LIB.loc[novel_a, "author"].title()} vs. {LIB.loc[novel_b, "title"].title()} by {LIB.loc[novel_b, "author"].title()} ({novel_b})',
                 fontsize=14)
# Every pairing of Great Expectations, Bleak House and Pudd'nhead Wilson.
for first, second in itertools.combinations((great_expectations, bleak, puddnhead), 2):
    compare_novels(first, second)
# Book ids for the further comparisons below, looked up by label on `novels`.
# additional dickens novels for comparison
haunted = novels.dickens_the_haunted_man_and_the_ghosts_bargain_644
martin = novels.dickens_martin_chuzzlewit_968
curious = novels.dickens_the_old_curiosity_shop_700
dombey = novels.dickens_dombey_and_sons_821
# dickens travel books
american_notes = novels.dickens_american_notes_675
italy = novels.dickens_pictures_from_italy_650
traveller = novels.dickens_the_uncommerical_traveller_914
# additional twain novels for comparison
carnival = novels.twain_the_facts_concerning_the_recent_carnival_of_crime_in_connecticut_3183
pauper = novels.twain_the_prince_and_the_pauper_1837
# twain travel books (these, together with `rough`, form `twain_travel` further down)
claimant = novels.twain_the_american_claimant_3179
innocents = novels.twain_the_innocents_abroad_3176
tramp = novels.twain_a_tramp_abroad_119
equator = novels.twain_following_the_equator_2895
miss = novels.twain_life_on_the_mississippi_245
# Hand-picked Dickens vs. Twain pairings; the trailing comment gives the theme.
compare_novels(haunted, carnival) # eerier stories (??)
compare_novels(martin, claimant) # books about america
compare_novels(american_notes, claimant) # books about america
compare_novels(martin, puddnhead) # books about america
compare_novels(two_cities, pauper) # twins
# books involving young protagonists
dickens_compare = [great_expectations, twist, curious, copperfield, dombey]
twain_compare = [sawyer, huck]
# Every (Dickens, Twain) pairing, Dickens book first. A direct Cartesian product
# replaces filtering the combinations of the concatenated lists.
novel_pairs = list(itertools.product(dickens_compare, twain_compare))
for dickens_id, twain_id in novel_pairs:
    compare_novels(dickens_id, twain_id)
# travel writing
dickens_travel = [traveller, american_notes, italy]
twain_travel = [rough, claimant, innocents, tramp, equator, miss]
# Every (Dickens, Twain) pairing, Dickens book first. A direct Cartesian product
# replaces filtering the combinations of the concatenated lists.
travel_pairs = list(itertools.product(dickens_travel, twain_travel))
for dickens_id, twain_id in travel_pairs:
    compare_novels(dickens_id, twain_id)
# A Tale of Two Cities against each of the two Twain "Adventures" novels.
compare_novels(two_cities, huck)
compare_novels(two_cities, sawyer)
# Write the combined library table and the CORPUS table to CSV files.
LIB.to_csv('full_LIB.csv')
CORPUS.to_csv('full_CORPUS.csv')
References:
- M10_04_AustenMelville.ipynb by Professor Raf Alvarado
- VADER in nltk: https://towardsdatascience.com/sentimental-analysis-using-vader-a3415fef7664
- Fixing the `.title()` "TypeError: 'Text' object is not callable" by using `.set_title()`: https://techoverflow.net/2021/04/04/how-to-fix-matplotlib-title-typeerror-text-object-is-not-callable/